In [393]:
# Widen the notebook's content area to 70% of the window width and center it,
# so the wide DataFrames below are easier to read.
from IPython.display import HTML, display

display(HTML("<style>.jp-Notebook {width: 70% !important; margin: auto !important;}</style>"))
Table of Contents:¶
Exploratory Data Analysis ¶
In [394]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
In [395]:
df = pd.read_csv('data.csv')
In [396]:
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 569 entries, 0 to 568 Data columns (total 33 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 569 non-null int64 1 diagnosis 569 non-null object 2 radius_mean 569 non-null float64 3 texture_mean 569 non-null float64 4 perimeter_mean 569 non-null float64 5 area_mean 569 non-null float64 6 smoothness_mean 569 non-null float64 7 compactness_mean 569 non-null float64 8 concavity_mean 569 non-null float64 9 concave points_mean 569 non-null float64 10 symmetry_mean 569 non-null float64 11 fractal_dimension_mean 569 non-null float64 12 radius_se 569 non-null float64 13 texture_se 569 non-null float64 14 perimeter_se 569 non-null float64 15 area_se 569 non-null float64 16 smoothness_se 569 non-null float64 17 compactness_se 569 non-null float64 18 concavity_se 569 non-null float64 19 concave points_se 569 non-null float64 20 symmetry_se 569 non-null float64 21 fractal_dimension_se 569 non-null float64 22 radius_worst 569 non-null float64 23 texture_worst 569 non-null float64 24 perimeter_worst 569 non-null float64 25 area_worst 569 non-null float64 26 smoothness_worst 569 non-null float64 27 compactness_worst 569 non-null float64 28 concavity_worst 569 non-null float64 29 concave points_worst 569 non-null float64 30 symmetry_worst 569 non-null float64 31 fractal_dimension_worst 569 non-null float64 32 Unnamed: 32 0 non-null float64 dtypes: float64(31), int64(1), object(1) memory usage: 146.8+ KB
All the relevant features are floats and there is an unnecessary feature called "Unnamed: 32" that seems to appear due to this issue with read_csv(): https://www.kaggle.com/discussions/general/354943
In [397]:
# Drop the irrelevant columns: 'id' carries no signal and 'Unnamed: 32' is an
# all-null artifact of a trailing comma in the CSV.
# Reassignment is preferred over inplace=True: it has no performance benefit to
# avoid, keeps the operation chainable, and makes the data lineage explicit.
df = df.drop(columns=['id', 'Unnamed: 32'])
df.head()
Out[397]:
| diagnosis | radius_mean | texture_mean | perimeter_mean | area_mean | smoothness_mean | compactness_mean | concavity_mean | concave points_mean | symmetry_mean | ... | radius_worst | texture_worst | perimeter_worst | area_worst | smoothness_worst | compactness_worst | concavity_worst | concave points_worst | symmetry_worst | fractal_dimension_worst | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | M | 17.99 | 10.38 | 122.80 | 1001.0 | 0.11840 | 0.27760 | 0.3001 | 0.14710 | 0.2419 | ... | 25.38 | 17.33 | 184.60 | 2019.0 | 0.1622 | 0.6656 | 0.7119 | 0.2654 | 0.4601 | 0.11890 |
| 1 | M | 20.57 | 17.77 | 132.90 | 1326.0 | 0.08474 | 0.07864 | 0.0869 | 0.07017 | 0.1812 | ... | 24.99 | 23.41 | 158.80 | 1956.0 | 0.1238 | 0.1866 | 0.2416 | 0.1860 | 0.2750 | 0.08902 |
| 2 | M | 19.69 | 21.25 | 130.00 | 1203.0 | 0.10960 | 0.15990 | 0.1974 | 0.12790 | 0.2069 | ... | 23.57 | 25.53 | 152.50 | 1709.0 | 0.1444 | 0.4245 | 0.4504 | 0.2430 | 0.3613 | 0.08758 |
| 3 | M | 11.42 | 20.38 | 77.58 | 386.1 | 0.14250 | 0.28390 | 0.2414 | 0.10520 | 0.2597 | ... | 14.91 | 26.50 | 98.87 | 567.7 | 0.2098 | 0.8663 | 0.6869 | 0.2575 | 0.6638 | 0.17300 |
| 4 | M | 20.29 | 14.34 | 135.10 | 1297.0 | 0.10030 | 0.13280 | 0.1980 | 0.10430 | 0.1809 | ... | 22.54 | 16.67 | 152.20 | 1575.0 | 0.1374 | 0.2050 | 0.4000 | 0.1625 | 0.2364 | 0.07678 |
5 rows × 31 columns
We drop the "id" and "Unnamed: 32" columns since they are irrelevant.
In [398]:
def diagnosis_value(diagnosis):
    """Encode a diagnosis label numerically: 'M' (malignant) -> 1, anything else -> 0."""
    if diagnosis == 'M':
        return 1
    return 0
Encode the target feature numerically.
In [399]:
# Convert the string labels to their 0/1 encoding, then preview the result.
df['diagnosis'] = df['diagnosis'].map(diagnosis_value)
df.head()
Out[399]:
| diagnosis | radius_mean | texture_mean | perimeter_mean | area_mean | smoothness_mean | compactness_mean | concavity_mean | concave points_mean | symmetry_mean | ... | radius_worst | texture_worst | perimeter_worst | area_worst | smoothness_worst | compactness_worst | concavity_worst | concave points_worst | symmetry_worst | fractal_dimension_worst | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 17.99 | 10.38 | 122.80 | 1001.0 | 0.11840 | 0.27760 | 0.3001 | 0.14710 | 0.2419 | ... | 25.38 | 17.33 | 184.60 | 2019.0 | 0.1622 | 0.6656 | 0.7119 | 0.2654 | 0.4601 | 0.11890 |
| 1 | 1 | 20.57 | 17.77 | 132.90 | 1326.0 | 0.08474 | 0.07864 | 0.0869 | 0.07017 | 0.1812 | ... | 24.99 | 23.41 | 158.80 | 1956.0 | 0.1238 | 0.1866 | 0.2416 | 0.1860 | 0.2750 | 0.08902 |
| 2 | 1 | 19.69 | 21.25 | 130.00 | 1203.0 | 0.10960 | 0.15990 | 0.1974 | 0.12790 | 0.2069 | ... | 23.57 | 25.53 | 152.50 | 1709.0 | 0.1444 | 0.4245 | 0.4504 | 0.2430 | 0.3613 | 0.08758 |
| 3 | 1 | 11.42 | 20.38 | 77.58 | 386.1 | 0.14250 | 0.28390 | 0.2414 | 0.10520 | 0.2597 | ... | 14.91 | 26.50 | 98.87 | 567.7 | 0.2098 | 0.8663 | 0.6869 | 0.2575 | 0.6638 | 0.17300 |
| 4 | 1 | 20.29 | 14.34 | 135.10 | 1297.0 | 0.10030 | 0.13280 | 0.1980 | 0.10430 | 0.1809 | ... | 22.54 | 16.67 | 152.20 | 1575.0 | 0.1374 | 0.2050 | 0.4000 | 0.1625 | 0.2364 | 0.07678 |
5 rows × 31 columns
Applied the numerical encoding of the target variable to the dataset.
In [400]:
df.isnull().values.any()
Out[400]:
np.False_
There are no null values.
In [401]:
# Histogram of every column (31 panels on a 6x6 grid) to eyeball each
# feature's distribution and the class balance of the target.
axes = df.hist(bins=15, figsize=(20, 15), layout=(6, 6))
plt.tight_layout()
plt.show()
The dataset is slightly imbalanced.
Most features seem to follow a normal distribution (perhaps with a positive skewness).
For the rest it's feasible to apply a log-transform.
In [402]:
# Find the strongly right-skewed features (skewness > 2) and log-transform
# them before standardizing everything.
skewness = df.skew().sort_values(ascending=False)
skewness = skewness[skewness > 2]
print(skewness)
skewed_cols = skewness.index.tolist()
# np.log1p computes log(1 + x): the +1 shift keeps log(0) defined, and log1p
# is numerically more accurate than np.log(x + 1) for values near zero.
df[skewed_cols] = df[skewed_cols].apply(np.log1p)
# Standardize every predictor (zero mean, unit variance); leave the binary
# target untouched.
feature_cols = df.columns.difference(['diagnosis'])
df[feature_cols] = preprocessing.StandardScaler().fit_transform(df[feature_cols])
area_se 5.447186 concavity_se 5.110463 fractal_dimension_se 3.923969 perimeter_se 3.443615 radius_se 3.088612 smoothness_se 2.314450 symmetry_se 2.195133 dtype: float64
Select features with a large positive skewness to apply log-transform.
After that, standardize every feature (except diagnosis) with StandardScaler().
In [403]:
# Re-plot the distributions to verify the log-transform reduced the skew
# and the scaling centered the features.
axes = df.hist(bins=15, figsize=(20, 15), layout=(6, 6))
plt.tight_layout()
plt.show()
In [404]:
# Plot the correlation matrix
# Correlation heatmap of all features plus the encoded target.
# Fix: the original created a figure with figsize=(12, 10) and then immediately
# overrode it via .figure.set_size_inches(20, 10) — set the size once, explicitly.
fig, ax = plt.subplots(figsize=(20, 10))
correlation_matrix = df.corr()
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm', linewidths=0.5, ax=ax)
ax.set_title('Correlation Matrix')
plt.show()
The eventual diagnosis correlates well with the "size" (perimeter, radius, area) and concavity of the nucleus.
Generally, the dataset seems to be highly correlated, although there are several irrelevant correlations between features that measure roughly the same thing (mean, worst, se for every feature).
In [405]:
# Plot pairplot for a subset of features
# Pairwise scatter plots of the "_mean" features, colored by diagnosis,
# to judge how well the two classes separate.
mean_features = [
    "diagnosis", "radius_mean", "texture_mean", "perimeter_mean", "area_mean",
    "smoothness_mean", "compactness_mean", "concavity_mean", "concave points_mean",
    "symmetry_mean", "fractal_dimension_mean",
]
sns.pairplot(df[mean_features], hue="diagnosis", palette={1: 'orange', 0: 'blue'})
plt.show()
Overall, the data is well separated. The only features that are not well-separated are fractal_dimension, symmetry, smoothness and texture.
We can also observe that the larger the nucleus is (measured by the radius, perimeter and area) there is a higher probability of having malignant breast cancer.
Concavity shows similar properties but there are a few outliers.
Training ¶
In general, we will use cross-validation and a Grid/Random search for hyperparameter optimization.
Neural Networks ¶
In [406]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import classification_report

# Split predictors/target and hold out 20% of rows for final evaluation.
X = df.drop('diagnosis', axis=1)
y = df['diagnosis']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Generous max_iter so every hyperparameter combination can run to convergence.
mlp = MLPClassifier(random_state=1, max_iter=10000)

# Search space: 4 * 2 * 2 * 3 * 3 = 144 combinations, each cross-validated 5-fold.
# NOTE(review): GridSearchCV breaks score ties by grid order, so the key/value
# order here is load-bearing for which params are reported as "best".
param_grid = {
    'hidden_layer_sizes': [(8,), (16,), (32,), (16, 16)],
    'activation': ['tanh', 'relu'],
    'solver': ['adam', 'sgd'],
    'learning_rate_init': [0.001, 0.01, 0.1],
    'alpha': [0.0001, 0.001, 0.01],
}

# Stratified folds keep the class ratio stable across splits; fixed seed for
# reproducible fold assignment.
stratified_kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(
    estimator=mlp,
    param_grid=param_grid,
    cv=stratified_kf,
    scoring='accuracy',
    n_jobs=-1
)
grid_search.fit(X_train, y_train)

print("Best parameters found:", grid_search.best_params_)
print("Best cross-validation accuracy:", grid_search.best_score_)

# Evaluate the refit best estimator on the held-out test set.
best_model = grid_search.best_estimator_
test_accuracy = best_model.score(X_test, y_test)
print("Test set accuracy:", test_accuracy)
y_pred = best_model.predict(X_test)
print("Classification Report:\n", classification_report(y_test, y_pred))
Best parameters found: {'activation': 'relu', 'alpha': 0.01, 'hidden_layer_sizes': (32,), 'learning_rate_init': 0.1, 'solver': 'adam'}
Best cross-validation accuracy: 0.9802197802197803
Test set accuracy: 0.9736842105263158
Classification Report:
precision recall f1-score support
0 0.97 0.99 0.98 71
1 0.98 0.95 0.96 43
accuracy 0.97 114
macro avg 0.97 0.97 0.97 114
weighted avg 0.97 0.97 0.97 114
Logistic Regression ¶
In [407]:
from sklearn.linear_model import LogisticRegression

# Baseline linear model on the same train/test split used for the MLP.
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
# Fix: the original computed the score but discarded it (it was neither printed
# nor the cell's last expression), so the accuracy was never displayed.
print("Test set accuracy:", logreg.score(X_test, y_test))
# Show the learned coefficients to gauge per-feature influence
# (meaningful here because the features were standardized above).
logreg.coef_
Out[407]:
array([[ 0.42170575, 0.4597363 , 0.38992846, 0.45428822, 0.07404599,
-0.52730425, 0.83194298, 1.11157693, -0.24781408, -0.06299014,
1.17960835, -0.19665484, 0.16687914, 1.06692987, 0.29099273,
-0.68468889, -0.17897315, 0.37137957, -0.65938838, -0.63642068,
0.74226631, 1.32019565, 0.49342255, 0.71976254, 0.51230787,
0.03009625, 0.98072941, 0.70117948, 1.26855711, 0.19026785]])